library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidytext)
library(tidyr)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(ggplot2)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(stringr)
library(grid)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(RSentiment)
#trace("calculate_score",edit=TRUE)
source(file="Data/Analyze_Training_Sets.R")
source(file="Data/merge_data.R")
Suicidal posts are more wordy than non-suicidal ones, so this looks like a good predictor.
wordcount <- function(str) {
sapply(gregexpr("\\b\\W+\\b", str, perl=TRUE), function(x) sum(x>0) ) + 1
}
complete<-complete %>%
mutate(wc = wordcount(text))
save(complete,file = "complete.RData")
I tried more complex sentiments, using the nrc library. The main differences seemed to be in positive and negative sentiments, though.
load(file = "complete.RData")
#Get the tokens out of the posts
post.tok <- complete %>%
mutate(linenumber=row_number()) %>%
unnest_tokens(word,text)
nrc <- sentiments %>%
filter(lexicon == "nrc") %>%
dplyr::select(word, sentiment)
#Label the words with the sentiments using an inner join
#with the nrc sentiments
post.sent = post.tok %>% inner_join(nrc)
## Joining, by = "word"
#Make a table of the sentiments
# Table - only 10 sentiments in posts from
#suicidal users
sui.sent <- post.sent %>%
filter(suicidal=='suicidal') %>%
group_by(sentiment) %>%
summarize(n=n()) %>%
filter(n>10) %>%
arrange(desc(n))
# Table - 10 most common sentiments in posts NOT in r/depression
# from from suicidal users
not.sui.sent <- post.sent %>%
filter(suicidal=='not suicidal') %>%
group_by(sentiment) %>%
summarize(n=n()) %>%
filter(n>10) %>%
arrange(desc(n))
#not.sui.sent[1:10,]
comparison <- sui.sent %>%
rename(sui = n) %>%
inner_join(not.sui.sent,by="sentiment") %>%
rename(not.sui = n) %>%
mutate(sui = sui / sum(sui),
not.sui = not.sui / sum(not.sui),diff=sui-not.sui) %>%
arrange(diff)
sui.sent.for.merge <- comparison[1:10,1:2] %>%
mutate(suicidal = "suicidal")
not.sui.sent.for.merge <- comparison[1:10,c(1,3)] %>%
mutate(suicidal = "not suicidal")
colnames(sui.sent.for.merge)<-c("sentiment","value","suicidal")
colnames(not.sui.sent.for.merge)<-c("sentiment","value","suicidal")
plot.sentiments <-rbind(sui.sent.for.merge,not.sui.sent.for.merge)
plot.sentiments$sentiment <- factor(plot.sentiments$sentiment,
levels = plot.sentiments$sentiment[order(comparison[1:10,]$diff)])
plot.sentiments$sentiment # notice the changed order of factor levels
## [1] positive anticipation joy trust surprise
## [6] anger disgust fear sadness negative
## [11] positive anticipation joy trust surprise
## [16] anger disgust fear sadness negative
## 10 Levels: positive anticipation joy trust surprise anger disgust ... negative
ggplot(plot.sentiments, aes(sentiment, value)) +
geom_bar(aes(fill = suicidal), position = "dodge", stat="identity")
Simple sentiments (positive, negative, neutral)
load(file = "complete.RData")
#Make a table of the sentiments
# Table - only 10 sentiments in posts from
#suicidal users
complete <- complete %>%
mutate(score = calculate_score(text), post.id = row_number())
save(complete,file="complete.RData")
load(file="complete.RData")
#Plot the sentiment range
#hist(complete2$score)
ggplot(complete, aes(post.id, score, color=factor(suicidal)))+
geom_point() +
scale_color_brewer(palette="Set1")+
labs(title = "Sentiment score in mental health subreddits")+
theme(legend.position = c(0.2, 0.15))
Suicidal posts are less readable than non-suicidal ones (higher reading age/grade-level), so this looks like a good predictor.
#### Linguistic Features ###
#Readability
#install.packages('koRpus')
#install.packages('tm')
library(koRpus)
##
## Attaching package: 'koRpus'
## The following object is masked from 'package:dplyr':
##
## query
#library(tm)
#get my list of source files.
load(file="complete.RData")
#Write each post to its own file
n <-length(complete$text)
filepaths <-rep("",n)
for (i in 1:n){
str <-complete$text[i]
text<- paste(str,i,sep="")
filename <-paste("str/str",i,".txt",sep="")
filepaths[i]<-filename
write(str,file=filename)
}
#list of kRp.tagged object using tokenize which is the default tagger
#given with the koRpus package
ll.tagged <- lapply(filepaths, tokenize, lang="en")
#Once I have my list of "tagged" objects I can get flesch-kincaid readability, in age
ll.flesch <- lapply(ll.tagged,flesch.kincaid,quiet=TRUE)
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
## Warning: Text is relatively short (<100 tokens), results are probably not
## reliable!
age <-rep(0,n)
#
#Now write all those to a .txt file
for (i in 1:n){
age[i] <-attr(ll.flesch[[i]], which="Flesch.Kincaid")$age
}
complete <- complete %>%
mutate(age = age)
#Write complete data to file
save(complete,file = "complete.RData")
load(file="complete.RData")
#complete2 <- complete2 %>%
ggplot(complete, aes(post.id, age, color=factor(suicidal)))+
geom_point() +
# scale_color_brewer(palette="Set1") +
labs(title = "Reading age of posts and
comments in mental health subreddits")
#Looks like the readability of the suicidal posts is 15 or above?
#almost def above 30
#most are under 15 for both categories
For this section, note that value is percentage of the text that contains that word (e.g., the text “cat dog cat” is 66.7% “cat”).
load("complete.RData")
#Get the tokens out of the posts
post.tok <- complete %>%
mutate(linenumber=row_number()) %>%
unnest_tokens(word,text)
###Part 1: Stop words
###Part 1.A : Non-suicidal words
data("stop_words")
tidy.post.tok <- post.tok
#Get counts for each
suicidal.post.tok <- tidy.post.tok %>%
filter(suicidal=='suicidal') %>%
group_by(word) %>%
summarize(n=n()) %>%
arrange(desc(n))
not.suicidal.post.tok <- tidy.post.tok %>%
filter(suicidal=='not suicidal') %>%
group_by(word) %>%
summarize(n=n()) %>%
arrange(desc(n))
comparison <- suicidal.post.tok %>%
rename(sui = n) %>%
inner_join(not.suicidal.post.tok,by="word") %>%
rename(not.sui = n) %>%
mutate(sui = sui / sum(sui),
not.sui = not.sui / sum(not.sui),diff=sui-not.sui) %>%
arrange(diff)
#The words used by the depression NON-suicidal users is interesting.
#We could use these as predictors for the logistic regression.
head(comparison)
## # A tibble: 6 × 4
## word sui not.sui diff
## <chr> <dbl> <dbl> <dbl>
## 1 you 0.004815952 0.015738152 -0.010922199
## 2 your 0.001129584 0.004427475 -0.003297891
## 3 a 0.020663952 0.023517602 -0.002853650
## 4 is 0.008218233 0.010504051 -0.002285818
## 5 are 0.002360628 0.004373700 -0.002013072
## 6 can 0.002915274 0.004588800 -0.001673526
#The words most used by the suicidal users isn't useful,
#since we identify them by the words "die", "kill", etc.,
#so it's unsurprising the words in the phrases we used to
#appear the most frequently and don't appear much in the non-suicidal group.
tail(comparison)
## # A tibble: 6 × 4
## word sui not.sui diff
## <chr> <dbl> <dbl> <dbl>
## 1 to 0.037993263 0.035473579 0.002519685
## 2 and 0.032095074 0.029325303 0.002769772
## 3 want 0.006567822 0.002921775 0.003646047
## 4 me 0.015347466 0.010611601 0.004735864
## 5 my 0.022564630 0.014106976 0.008457653
## 6 i 0.068160604 0.050261705 0.017898899
range(comparison$diff)
## [1] -0.0109222 0.0178989
#Plot the difference in word use as graphs
plot.words <- melt(head(comparison)) %>% filter(variable!="diff")
## Using word as id variables
plot.words$word <- factor(plot.words$word,
levels = plot.words$word[order(comparison[1:10,]$diff)])
## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated
plot.words$word # notice the changed order of factor levels
## [1] you your a is are can you your a is are can
## Levels: you your a is are can you your a is
ggplot(plot.words, aes(word, value))+
geom_bar(aes(fill = variable), position = "dodge", stat="identity")+
labs(title = "Word choice in posts and comments in mental health subreddits")+
theme(legend.position = c(0.86, 0.85))
## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated
## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated
#Now write all those to a column
n<-length(complete$text)
#these words are wrong, fix them
a_word<-rep(0,n)
is_word<-rep(0,n)
are_word<-rep(0,n)
can_word<-rep(0,n)
for (i in 1:n) {
a_word[i] <-str_count(complete$text[i],"a") + str_count(complete$text[i],"a")
is_word[i] <-str_count(complete$text[i]," is ") + str_count(complete$text[i],"Is ")
are_word[i] <-str_count(complete$text[i]," are ") + str_count(complete$text[i]," Are ")
can_word[i] <-str_count(complete$text[i]," can ") + str_count(complete$text[i],"Can ")
}
complete<- cbind(complete,a_word,is_word,are_word,
can_word)
save(complete,file = "complete.RData")
load(file="complete.RData")
#Plot the difference in word use as graphs
sui.words <-tail(comparison)
plot.words2 <- melt(sui.words) %>% filter(variable!="diff")
## Using word as id variables
plot.words2$word <- factor(plot.words2$word,
levels = plot.words2$word[order(sui.words$diff)])
plot.words2$word # notice the changed order of factor levels
## [1] to and want me my i to and want me my i
## Levels: to and want me my i
ggplot(plot.words2, aes(word, value))+
geom_bar(aes(fill = variable), position = "dodge", stat="identity")+
labs(title = "Word choice in posts and comments in mental health subreddits")+
theme(legend.position = c(0.16, 0.85))
range(sui.words$diff)
## [1] 0.002519685 0.017898899
#Now write all those to a column
n<-length(complete$text)
to_word<-rep(0,n)
and_word<-rep(0,n)
want_word<-rep(0,n)
for (i in 1:n) {
to_word[i] <-str_count(complete$text[i]," to ") + str_count(complete$text[i]," To ")
and_word[i] <-str_count(complete$text[i]," and ") + str_count(complete$text[i],"And ")
want_word[i] <-str_count(complete$text[i],"want")
}
complete<- cbind(complete,to_word,and_word,want_word)
#get rid of stop words
data("stop_words")
tidy.post.tok <- post.tok %>%
anti_join(stop_words)
## Joining, by = "word"
#Get counts for each
suicidal.post.tok <- tidy.post.tok %>%
filter(suicidal=='suicidal') %>%
group_by(word) %>%
summarize(n=n()) %>%
arrange(desc(n))
not.suicidal.post.tok <- tidy.post.tok %>%
filter(suicidal=='not suicidal') %>%
group_by(word) %>%
summarize(n=n()) %>%
arrange(desc(n))
comparison <- suicidal.post.tok %>%
rename(sui = n) %>%
inner_join(not.suicidal.post.tok,by="word") %>%
rename(not.sui = n) %>%
mutate(sui = sui / sum(sui),
not.sui = not.sui / sum(not.sui),diff=sui-not.sui) %>%
arrange(diff)
#The words used by the depression NON-suicidal users is interesting.
#We could use these as predictors for the logistic regression.
head(comparison)
## # A tibble: 6 × 4
## word sui not.sui diff
## <chr> <dbl> <dbl> <dbl>
## 1 people 0.0126053905 0.016809653 -0.004204263
## 2 person 0.0041741534 0.006207714 -0.002033561
## 3 advice 0.0008845888 0.002510986 -0.001626397
## 4 hope 0.0018797512 0.003347981 -0.001468230
## 5 positive 0.0005805114 0.002022738 -0.001442227
## 6 anxiety 0.0031789910 0.004533724 -0.001354733
#The words most used by the suicidal users isn't useful,
#since we identify them by the words "die", "kill", etc.,
#so it's unsurprising the words in the phrases we used to
#appear the most frequently and don't appear much in the non-suicidal group.
tail(comparison)
## # A tibble: 6 × 4
## word sui not.sui diff
## <chr> <dbl> <dbl> <dbl>
## 1 dont 0.004284727 0.000697496 0.003587231
## 2 fucking 0.005860401 0.001673990 0.004186410
## 3 anymore 0.006330339 0.001394992 0.004935347
## 4 life 0.018852799 0.012833926 0.006018873
## 5 die 0.007767795 0.001325242 0.006442553
## 6 kill 0.007242571 0.000697496 0.006545075
range(comparison$diff)
## [1] -0.004204263 0.006545075
#Plot the difference in word use as graphs
plot.words <- melt(head(comparison)) %>% filter(variable!="diff")
## Using word as id variables
plot.words$word <- factor(plot.words$word,
levels = plot.words$word[order(comparison[1:10,]$diff)])
## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated
plot.words$word # notice the changed order of factor levels
## [1] people person advice hope positive anxiety people
## [8] person advice hope positive anxiety
## 10 Levels: people person advice hope positive anxiety people ... hope
ggplot(plot.words, aes(word, value))+
geom_bar(aes(fill = variable), position = "dodge", stat="identity")+
labs(title = "Word choice in posts and comments in mental health subreddits")+
theme(legend.position = c(0.86, 0.85))
## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated
## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated
#Now write all those to a column
n<-length(complete$text)
#these words are wrong, fix them
people_word<-rep(0,n)
person_word<-rep(0,n)
hope_word<-rep(0,n)
advice_word<-rep(0,n)
positive_word<-rep(0,n)
anxiety_word<-rep(0,n)
for (i in 1:n) {
people_word[i] <-str_count(complete$text[i],"people")
person_word[i] <-str_count(complete$text[i],"person")
hope_word[i] <-str_count(complete$text[i],"hope")
advice_word[i] <-str_count(complete$text[i],"advice")
positive_word[i] <-str_count(complete$text[i],"positive")
anxiety_word[i] <-str_count(complete$text[i],"anxiety")
}
complete<- cbind(complete, people_word,person_word,hope_word,advice_word,
positive_word,anxiety_word)
save(complete,file = "complete.RData")
#Plot the difference in word use as graphs
sui.words <-tail(comparison)
plot.words2 <- melt(sui.words) %>% filter(variable!="diff")
## Using word as id variables
#sui.words<-plot.words2
#sui.words
#desc(sui.words$diff)
plot.words2$word <- factor(plot.words2$word,
levels = plot.words2$word[order(sui.words$diff)])
plot.words2$word # notice the changed order of factor levels
## [1] dont fucking anymore life die kill dont fucking
## [9] anymore life die kill
## Levels: dont fucking anymore life die kill
ggplot(plot.words2, aes(word, value))+
geom_bar(aes(fill = variable), position = "dodge", stat="identity")+
labs(title = "Word choice in posts and comments in mental health subreddits")+
theme(legend.position = c(0.16, 0.85))
range(sui.words$diff)
## [1] 0.003587231 0.006545075
#Now write all those to a column
n<-length(complete$text)
kill_word<-rep(0,n)
die_word<-rep(0,n)
anymore_word<-rep(0,n)
life_word<-rep(0,n)
fucking_word <-rep(0,n)
dont_word <-rep(0,n)
for (i in 1:n) {
kill_word[i] <-str_count(complete$text[i],"kill")
die_word[i] <-str_count(complete$text[i]," die ") +
str_count(complete$text[i]," die.") +
str_count(complete$text[i]," die!") +
str_count(complete$text[i]," die?") +
str_count(complete$text[i]," died") +
str_count(complete$text[i]," DIE")
life_word[i] <-str_count(complete$text[i],"life")
anymore_word[i] <-str_count(complete$text[i],"anymore")
fucking_word[i] <-str_count(complete$text[i],"fucking")
dont_word[i] <-str_count(complete$text[i],"don't")
}
complete<- cbind(complete, kill_word,die_word,anymore_word,
life_word,fucking_word,dont_word)
save(complete,file = "complete.RData")
Suicidal people exhibit more self-attentional focus, so their pronoun use looks to be of interest.
#2. Higher self-attentional focus
#Pronouns
load(file="complete.RData")
#Now write all those to a column
n<-length(complete$text)
first_pronouns<-rep(0,n)
for (i in 1:n) {
first_pronouns[i] <-str_count(complete$text[i]," I ") +
str_count(complete$text[i]," i ") +
str_count(complete$text[i],"I’m") +
str_count(complete$text[i],"I‘d") +
str_count(complete$text[i],"I‘ll") +
str_count(complete$text[i],"I’ve") +
str_count(complete$text[i]," me ") +
str_count(complete$text[i]," me.") +
str_count(complete$text[i]," me?") +
str_count(complete$text[i]," me!") +
str_count(complete$text[i]," my ") +
str_count(complete$text[i]," My ")
}
complete<- cbind(complete, first_pronouns)
save(complete,file = "complete.RData")
ggplot(complete, aes(post.id, first_pronouns,color=factor(suicidal)))+
geom_point() +
scale_color_brewer(palette="Set1") +
labs(title = "First-person pronouns in reddit posts")+
theme(legend.position = c(0.1, 0.9))
#2nd person
n<-length(complete$text)
sec_pronouns<-rep(0,n)
for (i in 1:n) {
sec_pronouns[i] <-str_count(complete$text[i]," you ") + str_count(complete$text[i],"You") +
str_count(complete$text[i]," you’re ") + str_count(complete$text[i],"You're") +
str_count(complete$text[i],"you’d") + str_count(complete$text[i],"You'd") +
str_count(complete$text[i],"you’ll") + str_count(complete$text[i],"You'll") +
str_count(complete$text[i],"you’ve") + str_count(complete$text[i],"You've") +
str_count(complete$text[i],"your") + str_count(complete$text[i],"Your") +
str_count(complete$text[i],"yours")
}
complete<- cbind(complete, sec_pronouns)
save(complete,file = "complete.RData")
ggplot(complete, aes(post.id, sec_pronouns,color=factor(suicidal)))+
geom_point() +
scale_color_brewer(palette="Set1") +
labs(title = "Second-person pronouns in reddit posts")+
theme(legend.position = c(0.1, 0.9))
#3rd person
source(file="pronoun_dict.R")
third.pronouns <-as.data.frame(word.list[3])
colnames(third.pronouns) <-"word"
#suicidal
sui.post.tok <- complete %>%
filter(suicidal=='suicidal') %>%
unnest_tokens(word,text)
sui.pronoun.post.tok <- sui.post.tok %>%
group_by(word) %>%
summarize(n=n()) %>%
arrange(desc(n))
sui.third = inner_join(sui.post.tok, third.pronouns,by="word")
## Warning in inner_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining
## character vector and factor, coercing into character vector
sui.results <- sui.third %>%
group_by(word) %>%
summarize(n=n()) %>%
arrange(desc(n))
#Not suicidal
not.sui.post.tok <- complete %>%
filter(suicidal=='not suicidal') %>%
unnest_tokens(word,text)
not.sui.pronoun.post.tok <- not.sui.post.tok %>%
group_by(word) %>%
summarize(n=n()) %>%
arrange(desc(n))
not.sui.third = inner_join(not.sui.pronoun.post.tok, third.pronouns,by="word")
## Warning in inner_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining
## character vector and factor, coercing into character vector
not.sui.results <- not.sui.third %>%
group_by(word) %>%
summarize(n=n()) %>%
arrange(desc(n))
third.comparison <- sui.results %>%
rename(sui = n) %>%
inner_join(not.sui.results,by="word") %>%
rename(not.sui = n) %>%
mutate(sui = sui / sum(sui),
not.sui = not.sui / sum(not.sui),diff=sui-not.sui) %>%
arrange(diff)
#Plot the difference in word use as graphs
library(reshape2)
#plot.words <- melt(comparison[1:10,]) %>% filter(variable!="diff")
plot.words <- melt(third.comparison) %>% filter(variable!="diff")
## Using word as id variables
plot.words$word <- factor(plot.words$word,
levels = plot.words$word[order(third.comparison$diff)])
plot.words$word # notice the changed order of factor levels
## [1] hers it’s its his their him them he they her it
## [12] hers it’s its his their him them he they her it
## Levels: hers it’s its his their him them he they her it
ggplot(plot.words, aes(word, value))+
geom_bar(aes(fill = variable), position = "dodge", stat="identity")+
labs(title = "Word choice in posts and comments in mental health subreddits")+
theme(legend.position = c(0.86, 0.85))
#it looks predictive
#her and it words predict suicidal thoughts, all other pronouns are not suicidal
her_word<-rep(0,nrow(complete))
for (i in 1:n) {
her_word[i] <-
str_count(complete$text[i]," her ") + str_count(complete$text[i]," Her ") +
str_count(complete$text[i]," her.") + str_count(complete$text[i],"her!") +
str_count(complete$text[i]," her? ")
}
it_word<-rep(0,nrow(complete))
for (i in 1:n) {
it_word[i] <-str_count(complete$text[i]," I ") +
str_count(complete$text[i]," it ") + str_count(complete$text[i],"It ") +
str_count(complete$text[i]," it. ") + str_count(complete$text[i],"it!") +
str_count(complete$text[i]," it? ")
}
third_pronouns<-rep(0,nrow(complete))
for (i in 1:n) {
third_pronouns[i]<-
str_count(complete$text[i]," hers ") + str_count(complete$text[i],"Her ") +
str_count(complete$text[i],"it's") + str_count(complete$text[i],"It's") +
str_count(complete$text[i]," its ") + str_count(complete$text[i],"Its") +
str_count(complete$text[i]," his ") + str_count(complete$text[i]," His ") +
str_count(complete$text[i],"their") + str_count(complete$text[i],"Their") +
str_count(complete$text[i]," he ") + str_count(complete$text[i],"He ") +
str_count(complete$text[i]," him") +
str_count(complete$text[i],"them") +
str_count(complete$text[i],"they") + str_count(complete$text[i],"They")
}
complete<-cbind(complete,third_pronouns,it_word,her_word)
save(complete,file="complete.RData")
load("complete.RData")
ggplot(complete, aes(post.id, it_word,color=factor(suicidal)))+
geom_point() +
scale_color_brewer(palette="Set1") +
labs(title = "Use of 'it' in reddit posts")+
theme(legend.position = c(0.1, 0.9))
ggplot(complete, aes(post.id, third_pronouns,color=factor(suicidal)))+
geom_point() +
scale_color_brewer(palette="Set1") +
labs(title = "Third-person pronouns in reddit posts")+
theme(legend.position = c(0.1, 0.9))
ggplot(complete, aes(post.id, her_word,color=factor(suicidal)))+
geom_point() +
scale_color_brewer(palette="Set1") +
labs(title = "Use of word 'her' in reddit posts")+
theme(legend.position = c(0.1, 0.9))
These were chosen based on the subject matter of the hand classified posts.
girl_word<-rep(0,nrow(complete))
for (i in 1:n) {
girl_word[i]<-str_count(complete$text[i],"girl") + str_count(complete$text[i],"Girl")
}
family_words<-rep(0,nrow(complete))
for (i in 1:n) {
family_words[i]<- str_count(complete$text[i],"Mom") + str_count(complete$text[i],"mom") +
str_count(complete$text[i],"dad") + str_count(complete$text[i],"Dad") +
str_count(complete$text[i],"parents") + str_count(complete$text[i],"family") +
str_count(complete$text[i],"brother") + str_count(complete$text[i],"sister") +
str_count(complete$text[i],"cousin")
}
job_words<-rep(0,nrow(complete))
for (i in 1:n) {
job_words[i]<- str_count(complete$text[i],"job") + str_count(complete$text[i],"employ")
}
friend_words<-rep(0,nrow(complete))
for (i in 1:n) {
friend_words[i]<- str_count(complete$text[i],"friend") + str_count(complete$text[i],"Friend")
}
lone_words<-rep(0,nrow(complete))
for (i in 1:n) {
lone_words[i]<- str_count(complete$text[i],"lone") +
str_count(complete$text[i],"no one") + str_count(complete$text[i],"No one")
}
therapy_words<-rep(0,nrow(complete))
for (i in 1:n) {
therapy_words[i]<- str_count(complete$text[i],"psychiatr") + str_count(complete$text[i],"Psychiatr")
str_count(complete$text[i],"therap") + str_count(complete$text[i],"Therap")
}
help_word<-rep(0,nrow(complete))
for (i in 1:n) {
help_word[i]<- str_count(complete$text[i],"help")
}
complete<-cbind(complete,girl_word,family_words,job_words,friend_words,lone_words,therapy_words,help_word)
save(complete,file="complete.RData")
load("complete.RData")
ggplot(complete, aes(post.id, girl_word,color=factor(suicidal)))+
geom_point() +
scale_color_brewer(palette="Set1") +
labs(title = "Use of 'girl' in reddit posts")+
theme(legend.position = c(0.1, 0.9))
ggplot(complete, aes(post.id, family_words,color=factor(suicidal)))+
geom_point() +
scale_color_brewer(palette="Set1") +
labs(title = "Word choice in reddit posts")+
theme(legend.position = c(0.1, 0.9))
ggplot(complete, aes(post.id, friend_words,color=factor(suicidal)))+
geom_point() +
scale_color_brewer(palette="Set1") +
labs(title = "Word choice in reddit posts")+
theme(legend.position = c(0.1, 0.9))
ggplot(complete, aes(post.id, lone_words,color=factor(suicidal)))+
geom_point() +
scale_color_brewer(palette="Set1") +
labs(title = "Word choice in reddit posts")+
theme(legend.position = c(0.1, 0.9))
ggplot(complete, aes(post.id, therapy_words,color=factor(suicidal)))+
geom_point() +
scale_color_brewer(palette="Set1") +
labs(title = "Word choice in reddit posts")+
theme(legend.position = c(0.1, 0.9))
ggplot(complete, aes(post.id, help_word,color=factor(suicidal)))+
geom_point() +
scale_color_brewer(palette="Set1") +
labs(title = "Word choice in reddit posts")+
theme(legend.position = c(0.1, 0.9))
#Change all word counts to log
predictors<-colnames(complete)[11:40]
n<-length(predictors)
mut_text=""
for (i in 1:n){
currvar <-predictors[i]
if (i==n){
mut_text = paste(mut_text,'mutate(', currvar, '=log(',currvar, '+1))', sep="")
}else{
mut_text = paste(mut_text,'mutate(', currvar, '=log(',currvar, '+1)) %>% ', sep="")
}
}
complete2<- complete %>%
mutate(wc=log(wc+1)) %>%
mutate(score=log(score+100)) %>%
# mutate(grade=log(grade+5)) %>%
mutate(age=log(age)) %>%
mutate(a_word = log(a_word+1)) %>%
mutate(is_word =log(is_word+1)) %>%
mutate(are_word = log(are_word+1)) %>%
mutate(can_word=log(can_word+1)) %>%
mutate(to_word=log(to_word+1)) %>%
mutate(and_word=log(and_word+1)) %>%
mutate(want_word = log(want_word+1)) %>%
mutate(people_word=log(people_word+1)) %>%
mutate(person_word = log(person_word+1)) %>%
mutate(hope_word=log(hope_word+1)) %>%
mutate(advice_word = log(advice_word+1)) %>%
mutate(positive_word = log(positive_word+1)) %>%
mutate(anxiety_word = log(anxiety_word+1)) %>%
mutate(kill_word=log(kill_word+1)) %>%
mutate(die_word=log(die_word+1)) %>% #
mutate(anymore_word = log(anymore_word+1)) %>%
mutate(life_word = log(life_word+1)) %>%
mutate(fucking_word = log(fucking_word+1)) %>%
mutate(dont_word = log(dont_word+1)) %>%
mutate(first_pronouns=log(first_pronouns+1)) %>%
mutate(sec_pronouns=log(sec_pronouns+1)) %>%
mutate(third_pronouns=log(third_pronouns+1)) %>%
mutate(it_word=log(it_word+1)) %>% #
mutate(her_word=log(her_word+1)) %>%
mutate(girl_word=log(girl_word+1)) %>%
mutate(family_words=log(family_words+1)) %>%
mutate(job_words=log(job_words+1)) %>%
mutate(friend_words = log(friend_words+1)) %>%
mutate(lone_words = log(lone_words+1)) %>% #
mutate(therapy_words=log(therapy_words+1)) %>%
mutate(help_word=log(help_word+1))
save(complete2,file = "complete2.RData")
load("complete2.RData")
log_predictors <-colnames(complete2[,10:40])
#No y-limit
for (i in 1:length(log_predictors)){
curr_var <-log_predictors[i]
eval(parse(text=paste('p',i,'<- ggplot(complete2, aes(post.id, ',curr_var,',color=factor(suicidal)))+geom_point() +ylim(0,10)+scale_color_brewer(palette="Set1")',sep="")))
}
#They've all been logged
for (i in 1:length(log_predictors)){
curr_var <-log_predictors[i]
eval(parse(text=paste('print(p',i,')',sep="")))
}
load("complete.RData")
#Edit the dataframe to change 'suicidal' to 1,
load("complete.RData")
complete$suicidal<-gsub("not suicidal",0, complete$suicidal)
complete$suicidal<-gsub("suicidal",1, complete$suicidal)
complete$suicidal<-as.integer(complete$suicidal)
save(complete, file="complete.RData")
load("complete2.RData")
complete2$suicidal<-gsub("not suicidal",0, complete2$suicidal)
complete2$suicidal<-gsub("suicidal",1, complete2$suicidal)
complete2$suicidal<-as.integer(complete2$suicidal)
save(complete2, file="complete2.RData")
######################
#Prepare the datasets
load("complete2.RData")
#Select the outcome and predictors
data.for.analysis <-complete2 %>%
select(suicidal)
data.for.analysis <-cbind(data.for.analysis,complete2[,7:40])
save(data.for.analysis,file = "Data_for_analysis.RData")
load("Data_for_analysis.RData")
##############
#Get your training and test sets
set.seed(1)
train.indices<-sample(1368,1024)
training.set <-data.for.analysis[train.indices,]
test.set <-data.for.analysis[-train.indices,]
#Fit the full model
fit_full<-glm(suicidal~.,data=training.set,family=binomial)
summary(fit_full)
##
## Call:
## glm(formula = suicidal ~ ., family = binomial, data = training.set)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.8857 -0.5292 -0.2558 0.3963 2.8743
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.44843 1.73561 -2.563 0.010376 *
## wc 0.20403 0.45725 0.446 0.655440
## score 0.06910 0.31755 0.218 0.827742
## age 0.47414 0.27203 1.743 0.081341 .
## a_word -0.20037 0.33190 -0.604 0.546044
## is_word -0.51352 0.22106 -2.323 0.020182 *
## are_word 0.05483 0.28871 0.190 0.849367
## can_word -0.08188 0.27946 -0.293 0.769533
## to_word 0.02976 0.22498 0.132 0.894767
## and_word -0.57828 0.21436 -2.698 0.006983 **
## want_word 1.26747 0.25512 4.968 6.76e-07 ***
## people_word -0.72366 0.27134 -2.667 0.007655 **
## person_word -1.22592 0.40391 -3.035 0.002404 **
## hope_word -0.34760 0.43962 -0.791 0.429138
## advice_word -2.07378 0.71325 -2.907 0.003643 **
## positive_word -2.97012 0.89981 -3.301 0.000964 ***
## anxiety_word -0.66357 0.38410 -1.728 0.084062 .
## kill_word 3.61617 0.45457 7.955 1.79e-15 ***
## die_word 1.10130 0.18364 5.997 2.01e-09 ***
## anymore_word 1.83588 0.50688 3.622 0.000292 ***
## life_word 0.88977 0.26926 3.304 0.000952 ***
## fucking_word 1.22195 0.49813 2.453 0.014165 *
## dont_word -0.16658 0.23545 -0.708 0.479246
## first_pronouns 0.91727 0.23109 3.969 7.21e-05 ***
## sec_pronouns -0.32148 0.15503 -2.074 0.038115 *
## third_pronouns 0.08423 0.16888 0.499 0.617933
## it_word 0.14888 0.21487 0.693 0.488372
## her_word -0.36635 0.16966 -2.159 0.030826 *
## girl_word 0.04711 0.43459 0.108 0.913679
## family_words 0.18362 0.25842 0.711 0.477363
## job_words -0.84854 0.33689 -2.519 0.011778 *
## friend_words 0.06332 0.27134 0.233 0.815483
## lone_words 0.41442 0.36885 1.124 0.261207
## therapy_words -1.26600 0.79588 -1.591 0.111681
## help_word -0.08585 0.28789 -0.298 0.765538
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1406.85 on 1023 degrees of freedom
## Residual deviance: 717.39 on 989 degrees of freedom
## AIC: 787.39
##
## Number of Fisher Scoring iterations: 6
#Select a model using backwards stepwise selection
fit_reduced = step(fit_full,trace=0)
summary(fit_reduced)
##
## Call:
## glm(formula = suicidal ~ age + is_word + and_word + want_word +
## people_word + person_word + advice_word + positive_word +
## anxiety_word + kill_word + die_word + anymore_word + life_word +
## fucking_word + first_pronouns + sec_pronouns + her_word +
## job_words + therapy_words, family = binomial, data = training.set)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.7409 -0.5380 -0.2591 0.3944 2.8022
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.9790 0.6477 -6.143 8.09e-10 ***
## age 0.4685 0.2619 1.789 0.073561 .
## is_word -0.4810 0.2102 -2.288 0.022126 *
## and_word -0.5610 0.1915 -2.929 0.003395 **
## want_word 1.2998 0.2393 5.432 5.57e-08 ***
## people_word -0.6354 0.2530 -2.511 0.012028 *
## person_word -1.2111 0.3913 -3.095 0.001967 **
## advice_word -2.0931 0.6777 -3.088 0.002013 **
## positive_word -3.0176 0.8673 -3.479 0.000503 ***
## anxiety_word -0.6872 0.3688 -1.864 0.062378 .
## kill_word 3.6636 0.4500 8.142 3.89e-16 ***
## die_word 1.1538 0.1782 6.474 9.56e-11 ***
## anymore_word 1.8754 0.4890 3.835 0.000125 ***
## life_word 0.8749 0.2564 3.412 0.000646 ***
## fucking_word 1.2409 0.4828 2.570 0.010156 *
## first_pronouns 1.0226 0.1491 6.859 6.96e-12 ***
## sec_pronouns -0.3345 0.1355 -2.468 0.013594 *
## her_word -0.2909 0.1556 -1.869 0.061603 .
## job_words -0.7721 0.3236 -2.386 0.017036 *
## therapy_words -1.3362 0.7567 -1.766 0.077401 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1406.85 on 1023 degrees of freedom
## Residual deviance: 722.19 on 1004 degrees of freedom
## AIC: 762.19
##
## Number of Fisher Scoring iterations: 6
formula(fit_reduced)
## suicidal ~ age + is_word + and_word + want_word + people_word +
## person_word + advice_word + positive_word + anxiety_word +
## kill_word + die_word + anymore_word + life_word + fucking_word +
## first_pronouns + sec_pronouns + her_word + job_words + therapy_words
training.subset <-training.set %>%
select(c(suicidal,age , is_word , and_word , want_word , people_word ,
person_word , advice_word , positive_word , anxiety_word ,
kill_word , die_word , anymore_word , life_word , fucking_word ,
first_pronouns , sec_pronouns , her_word , job_words , therapy_words))
#Check for collinearity
vif(fit_reduced)
## age is_word and_word want_word people_word
## 1.118508 1.771984 3.530709 1.507238 1.358033
## person_word advice_word positive_word anxiety_word kill_word
## 1.355272 1.096909 1.128786 1.185493 1.121491
## die_word anymore_word life_word fucking_word first_pronouns
## 1.372880 1.100012 1.456701 1.087341 3.153895
## sec_pronouns her_word job_words therapy_words
## 1.129326 1.476929 1.304697 1.057962
# get rid of 'first_pronouns" and 'and_words' as a predictor
#since the VIF are > 2.5
#Refit model
training.subset2 <-training.subset %>%
select(-c(and_word,first_pronouns))
fit_reduced2<-glm(suicidal~.,data=training.subset2,family=binomial)
summary(fit_reduced2)
##
## Call:
## glm(formula = suicidal ~ ., family = binomial, data = training.subset2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.0174 -0.5903 -0.3639 0.4281 2.7556
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.1506 0.5577 -5.649 1.61e-08 ***
## age 0.6587 0.2301 2.862 0.004207 **
## is_word -0.2366 0.1991 -1.188 0.234729
## want_word 1.5662 0.2292 6.833 8.33e-12 ***
## people_word -0.4371 0.2431 -1.798 0.072226 .
## person_word -1.1131 0.3953 -2.816 0.004864 **
## advice_word -1.7666 0.6788 -2.603 0.009253 **
## positive_word -3.0162 0.8869 -3.401 0.000672 ***
## anxiety_word -0.4394 0.3697 -1.188 0.234641
## kill_word 3.7599 0.4181 8.993 < 2e-16 ***
## die_word 1.3668 0.1712 7.983 1.43e-15 ***
## anymore_word 2.2008 0.4898 4.494 7.01e-06 ***
## life_word 1.0966 0.2472 4.436 9.14e-06 ***
## fucking_word 1.3873 0.4933 2.812 0.004918 **
## sec_pronouns -0.4352 0.1328 -3.278 0.001045 **
## her_word -0.1849 0.1521 -1.215 0.224206
## job_words -0.5572 0.3235 -1.722 0.084992 .
## therapy_words -1.2466 0.7912 -1.576 0.115131
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1406.85 on 1023 degrees of freedom
## Residual deviance: 777.56 on 1006 degrees of freedom
## AIC: 813.56
##
## Number of Fisher Scoring iterations: 6
#Use BSS to drop non-significant predictors
training.subset3 <- training.subset2 %>%
select(-is_word)
fit_reduced3 <-glm(suicidal~.,data=training.subset3,family=binomial)
summary(fit_reduced3)
##
## Call:
## glm(formula = suicidal ~ ., family = binomial, data = training.subset3)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.9944 -0.5933 -0.3638 0.4335 2.6387
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.1706 0.5579 -5.683 1.32e-08 ***
## age 0.6568 0.2301 2.854 0.004314 **
## want_word 1.5385 0.2272 6.772 1.27e-11 ***
## people_word -0.4739 0.2410 -1.967 0.049229 *
## person_word -1.1241 0.3933 -2.858 0.004259 **
## advice_word -1.7637 0.6713 -2.627 0.008602 **
## positive_word -3.0874 0.8854 -3.487 0.000489 ***
## anxiety_word -0.4792 0.3700 -1.295 0.195260
## kill_word 3.6893 0.4110 8.976 < 2e-16 ***
## die_word 1.3397 0.1693 7.915 2.47e-15 ***
## anymore_word 2.1556 0.4899 4.401 1.08e-05 ***
## life_word 1.0263 0.2389 4.295 1.74e-05 ***
## fucking_word 1.3751 0.4946 2.780 0.005434 **
## sec_pronouns -0.4595 0.1308 -3.512 0.000444 ***
## her_word -0.2188 0.1498 -1.461 0.143997
## job_words -0.5554 0.3239 -1.715 0.086342 .
## therapy_words -1.3257 0.7792 -1.701 0.088887 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1406.85 on 1023 degrees of freedom
## Residual deviance: 778.99 on 1007 degrees of freedom
## AIC: 812.99
##
## Number of Fisher Scoring iterations: 6
training.subset4 <- training.subset3 %>%
select(-anxiety_word)
fit_reduced4 = glm(suicidal~.,data=training.subset4,family=binomial)
summary(fit_reduced4)
##
## Call:
## glm(formula = suicidal ~ ., family = binomial, data = training.subset4)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.0923 -0.5916 -0.3657 0.4474 2.6409
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.1011 0.5538 -5.600 2.14e-08 ***
## age 0.6245 0.2285 2.733 0.006280 **
## want_word 1.5141 0.2257 6.709 1.97e-11 ***
## people_word -0.4606 0.2403 -1.917 0.055255 .
## person_word -1.1058 0.3926 -2.817 0.004854 **
## advice_word -1.8062 0.6750 -2.676 0.007452 **
## positive_word -3.1450 0.9151 -3.437 0.000589 ***
## kill_word 3.6479 0.4073 8.956 < 2e-16 ***
## die_word 1.3123 0.1668 7.870 3.56e-15 ***
## anymore_word 2.1474 0.4906 4.378 1.20e-05 ***
## life_word 1.0115 0.2382 4.247 2.17e-05 ***
## fucking_word 1.3453 0.4972 2.706 0.006815 **
## sec_pronouns -0.4622 0.1310 -3.527 0.000420 ***
## her_word -0.2181 0.1489 -1.464 0.143059
## job_words -0.5711 0.3236 -1.765 0.077593 .
## therapy_words -1.4014 0.8050 -1.741 0.081704 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1406.85 on 1023 degrees of freedom
## Residual deviance: 780.64 on 1008 degrees of freedom
## AIC: 812.64
##
## Number of Fisher Scoring iterations: 6
training.subset5 <- training.subset4 %>%
select(-her_word)
fit_reduced5 = glm(suicidal~.,data=training.subset5,family=binomial)
summary(fit_reduced5)
##
## Call:
## glm(formula = suicidal ~ ., family = binomial, data = training.subset5)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.0381 -0.5932 -0.3634 0.4473 2.5511
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.1118 0.5529 -5.628 1.82e-08 ***
## age 0.6208 0.2281 2.722 0.006495 **
## want_word 1.4498 0.2207 6.570 5.04e-11 ***
## people_word -0.4523 0.2398 -1.886 0.059284 .
## person_word -1.1527 0.3904 -2.952 0.003152 **
## advice_word -1.8904 0.6767 -2.793 0.005217 **
## positive_word -3.1635 0.9348 -3.384 0.000714 ***
## kill_word 3.6113 0.4072 8.870 < 2e-16 ***
## die_word 1.2947 0.1655 7.821 5.22e-15 ***
## anymore_word 2.1228 0.4903 4.330 1.49e-05 ***
## life_word 0.9609 0.2358 4.074 4.61e-05 ***
## fucking_word 1.3157 0.4990 2.636 0.008381 **
## sec_pronouns -0.4557 0.1305 -3.492 0.000479 ***
## job_words -0.6231 0.3256 -1.914 0.055672 .
## therapy_words -1.4103 0.8007 -1.761 0.078165 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1406.8 on 1023 degrees of freedom
## Residual deviance: 782.8 on 1009 degrees of freedom
## AIC: 812.8
##
## Number of Fisher Scoring iterations: 6
#Deviance Residuals identify observations not well explained by the model.
resids.deviance<-residuals(fit_reduced5, type = c("deviance"))
plot(resids.deviance,training.subset5$post.id)
#a couple of outliers<-3 but nothing too crazy
predictor.names<-colnames(training.subset5)[-1]
n<-length(predictor.names)
#Make plots
for (i in 1:n){
currvar <- predictor.names[i]
eval(parse(text=paste('plot(training.subset5$',currvar,',resids.deviance)',sep="")))
}
Overall seems to not have many outliers…most within [-3,3], a few at -4
#Hat Matrix Diagonal detects extreme large points in the design space.
#These are often labeled as "leverage" or "hi" and are related to standardized residuals.
#A general rule says that if hi > 2*p/n or > 3*p/n the points is influential.
#Here "p" is the number of parameters in the model and "n" the number of observations.
#Here, 3*20/1024 = 0.05859375
#hats<-influence.measures(fit_reduced)$hat
hats<-hatvalues(fit_reduced5)
training.subset5 <- training.subset5 %>% mutate(index = row_number())
plot(hats,training.subset5$index)
training.subset5 <- training.subset5 %>%
select(-index)
There are quite a lot of possibly influential points. We will get rid of only the extreme outliers.
#Check if removing them does anything to the model
training.subset6<- cbind(training.subset5,resids.deviance,hats) %>%
filter(!resids.deviance < -3) %>%
filter(!hats > 0.2) %>%
select(-c(resids.deviance,hats))
fit6 = glm(suicidal~.,data=training.subset6,family=binomial)
summary(fit6)
##
## Call:
## glm(formula = suicidal ~ ., family = binomial, data = training.subset6)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6180 -0.5504 -0.3473 0.3439 2.6400
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.1969 0.5772 -5.539 3.04e-08 ***
## age 0.5865 0.2380 2.464 0.013735 *
## want_word 1.4990 0.2327 6.442 1.18e-10 ***
## people_word -0.3750 0.2514 -1.492 0.135823
## person_word -0.9363 0.4132 -2.266 0.023444 *
## advice_word -1.1724 0.7562 -1.550 0.121036
## positive_word -3.0843 0.9920 -3.109 0.001877 **
## kill_word 4.3369 0.4877 8.892 < 2e-16 ***
## die_word 1.4581 0.1762 8.277 < 2e-16 ***
## anymore_word 1.9412 0.5062 3.835 0.000126 ***
## life_word 1.0824 0.2490 4.347 1.38e-05 ***
## fucking_word 2.0863 0.6667 3.129 0.001753 **
## sec_pronouns -0.4822 0.1370 -3.520 0.000431 ***
## job_words -0.7446 0.3519 -2.116 0.034373 *
## therapy_words -1.3182 1.1235 -1.173 0.240665
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1398.16 on 1016 degrees of freedom
## Residual deviance: 721.35 on 1002 degrees of freedom
## AIC: 751.35
##
## Number of Fisher Scoring iterations: 6
#It does, so we remove the outliers and the predictors
#Therapy, advice, and people should be dropped
#Use BSS
training.subset7 <- training.subset6 %>%
select(-therapy_words)
fit_reduced7 = glm(suicidal~.,data=training.subset7,family=binomial)
summary(fit_reduced7)
##
## Call:
## glm(formula = suicidal ~ ., family = binomial, data = training.subset7)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6029 -0.5493 -0.3518 0.3411 2.6455
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.1815 0.5768 -5.516 3.47e-08 ***
## age 0.5770 0.2378 2.426 0.015271 *
## want_word 1.5028 0.2322 6.472 9.66e-11 ***
## people_word -0.3627 0.2515 -1.442 0.149289
## person_word -0.9112 0.4131 -2.206 0.027389 *
## advice_word -1.1850 0.7632 -1.553 0.120505
## positive_word -3.0449 0.9859 -3.089 0.002011 **
## kill_word 4.3295 0.4888 8.858 < 2e-16 ***
## die_word 1.4363 0.1745 8.229 < 2e-16 ***
## anymore_word 1.9445 0.5072 3.834 0.000126 ***
## life_word 1.0779 0.2489 4.331 1.49e-05 ***
## fucking_word 2.0894 0.6678 3.129 0.001754 **
## sec_pronouns -0.4855 0.1369 -3.547 0.000389 ***
## job_words -0.7626 0.3514 -2.170 0.029984 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1398.16 on 1016 degrees of freedom
## Residual deviance: 722.76 on 1003 degrees of freedom
## AIC: 750.76
##
## Number of Fisher Scoring iterations: 6
training.subset8 <- training.subset7 %>%
select(-people_word)
fit_reduced8 = glm(suicidal~.,data=training.subset8,family=binomial)
summary(fit_reduced8)
##
## Call:
## glm(formula = suicidal ~ ., family = binomial, data = training.subset8)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6246 -0.5459 -0.3496 0.3593 2.6851
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.1435 0.5727 -5.489 4.04e-08 ***
## age 0.5513 0.2358 2.338 0.019405 *
## want_word 1.4412 0.2274 6.339 2.32e-10 ***
## person_word -0.9912 0.4093 -2.422 0.015443 *
## advice_word -1.1507 0.7681 -1.498 0.134092
## positive_word -2.9716 0.9710 -3.060 0.002211 **
## kill_word 4.3263 0.4886 8.854 < 2e-16 ***
## die_word 1.4173 0.1731 8.188 2.65e-16 ***
## anymore_word 1.9050 0.4996 3.813 0.000137 ***
## life_word 1.0630 0.2484 4.279 1.88e-05 ***
## fucking_word 2.0111 0.6567 3.062 0.002197 **
## sec_pronouns -0.5154 0.1347 -3.826 0.000130 ***
## job_words -0.7815 0.3506 -2.229 0.025804 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1398.16 on 1016 degrees of freedom
## Residual deviance: 724.88 on 1004 degrees of freedom
## AIC: 750.88
##
## Number of Fisher Scoring iterations: 6
training.subset9 <- training.subset8 %>%
select(-advice_word)
fit_reduced9 = glm(suicidal~.,data=training.subset9,family=binomial)
summary(fit_reduced9)
##
## Call:
## glm(formula = suicidal ~ ., family = binomial, data = training.subset9)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6002 -0.5447 -0.3543 0.3430 2.7054
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.1232 0.5706 -5.473 4.42e-08 ***
## age 0.5409 0.2350 2.302 0.021348 *
## want_word 1.4144 0.2253 6.278 3.42e-10 ***
## person_word -0.9980 0.4061 -2.458 0.013977 *
## positive_word -2.9364 0.9663 -3.039 0.002375 **
## kill_word 4.3209 0.4888 8.840 < 2e-16 ***
## die_word 1.4067 0.1723 8.166 3.18e-16 ***
## anymore_word 1.8907 0.4970 3.804 0.000142 ***
## life_word 1.0449 0.2459 4.248 2.15e-05 ***
## fucking_word 2.0486 0.6572 3.117 0.001827 **
## sec_pronouns -0.5331 0.1342 -3.972 7.13e-05 ***
## job_words -0.7492 0.3476 -2.155 0.031143 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1398.16 on 1016 degrees of freedom
## Residual deviance: 727.26 on 1005 degrees of freedom
## AIC: 751.26
##
## Number of Fisher Scoring iterations: 6
#This looks non-linear because of the predictors
predictor.names<-colnames(training.subset9)[-1]
n<-length(predictor.names)
mut_text=""
#Means
for (i in 1:n){
if (i==n){
currvar<-predictor.names[i]
mut_text = paste(mut_text,'mutate(', currvar, '=mean(training.subset9$',currvar,'))', sep="")
} else {
currvar<-predictor.names[i]
mut_text = paste(mut_text,'mutate(', currvar, '=mean(training.subset9$',currvar,')) %>% ', sep="")
}
}
mean_data_text <- paste('mean_data<- training.subset9 %>% ',mut_text,sep="")
mean_data_text
## [1] "mean_data<- training.subset9 %>% mutate(age=mean(training.subset9$age)) %>% mutate(want_word=mean(training.subset9$want_word)) %>% mutate(person_word=mean(training.subset9$person_word)) %>% mutate(positive_word=mean(training.subset9$positive_word)) %>% mutate(kill_word=mean(training.subset9$kill_word)) %>% mutate(die_word=mean(training.subset9$die_word)) %>% mutate(anymore_word=mean(training.subset9$anymore_word)) %>% mutate(life_word=mean(training.subset9$life_word)) %>% mutate(fucking_word=mean(training.subset9$fucking_word)) %>% mutate(sec_pronouns=mean(training.subset9$sec_pronouns)) %>% mutate(job_words=mean(training.subset9$job_words))"
eval(parse(text=mean_data_text))
#Make plots
for (i in 1:n){
currvar <- predictor.names[i]
othervar <-predictor.names[-i]
#Add the non-modified currvar to the temp
temp <- mean_data
eval(parse(text=paste('temp$',currvar,'<-training.subset9$',currvar,sep="")))
#Get the predictions, with the other variables held constant
predictions <- predict(fit_reduced9,temp,type="response")
log.odds<-predictions
temp <- temp %>% mutate(log.odds = log.odds)
#Get the plot
eval(parse(text=paste('plot(temp$',currvar,',temp$log.odds)',sep="")))
}
Kill is not linear, so we drop it to avoid biasing the model too much Fucking,anymore, die, an want are not super linear either, but they aren’t as extreme and are very predictive, so we keep them.
training.subset10 <- training.subset9 %>%
#select(-c(fucking_word,anymore_word, die_word, kill_word, want_word))
select(-kill_word)
fit_reduced10<-glm(suicidal~.,data=training.subset10,family=binomial)
summary(fit_reduced10)
##
## Call:
## glm(formula = suicidal ~ ., family = binomial, data = training.subset10)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8939 -0.6442 -0.4338 0.5301 2.5246
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.9049 0.5240 -5.544 2.96e-08 ***
## age 0.5870 0.2165 2.711 0.006699 **
## want_word 1.6681 0.2003 8.329 < 2e-16 ***
## person_word -0.8294 0.3540 -2.343 0.019133 *
## positive_word -2.5071 0.7601 -3.298 0.000973 ***
## die_word 1.2780 0.1604 7.968 1.61e-15 ***
## anymore_word 2.0394 0.4430 4.604 4.15e-06 ***
## life_word 1.0977 0.2224 4.936 7.97e-07 ***
## fucking_word 2.2975 0.6131 3.748 0.000179 ***
## sec_pronouns -0.4770 0.1190 -4.009 6.08e-05 ***
## job_words -0.5213 0.3240 -1.609 0.107570
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1398.16 on 1016 degrees of freedom
## Residual deviance: 865.41 on 1006 degrees of freedom
## AIC: 887.41
##
## Number of Fisher Scoring iterations: 6
fit_final<- fit_reduced10
#ROC curve
train.predictions <- predict(fit_final,training.subset10,type="response")
n<-length(train.predictions)
results<-rep(1,n)
for (i in 1:n){
#Classify >0.5 as suicidal
if (train.predictions[i]>=0.5){
results[i]=1
} else {
#Classify <0.5 as not suicidal
results[i]=0
}
}
table(results, training.subset10$suicidal)
##
## results 0 1
## 0 503 121
## 1 60 333
plot(roc(training.subset10$suicidal,results),main="ROC curve for training data")
##
## Call:
## roc.default(response = training.subset10$suicidal, predictor = results)
##
## Data: results in 563 controls (training.subset10$suicidal 0) < 454 cases (training.subset10$suicidal 1).
## Area under the curve: 0.8135
auc(training.subset10$suicidal, results) # 0.8135
## Area under the curve: 0.8135
colnames(training.subset10)
## [1] "suicidal" "age" "want_word" "person_word"
## [5] "positive_word" "die_word" "anymore_word" "life_word"
## [9] "fucking_word" "sec_pronouns" "job_words"
#Test set
test.set.for.analysis <- test.set %>%
select(c(suicidal,age,want_word, person_word, positive_word,
die_word,anymore_word, life_word, fucking_word, sec_pronouns,
job_words))
test.predictions <- predict(fit_final,test.set.for.analysis,type="response")
n<-length(test.set.for.analysis$suicidal)
results<-rep(1,n)
for (i in 1:n){
#Classify >0.5 as suicidal
if (test.predictions[i]>=0.5){
results[i]=1
} else {
#Classify <0.5 as not suicidal
results[i]=0
}
}
table(results,test.set.for.analysis$suicidal)
##
## results 0 1
## 0 167 43
## 1 21 113
plot(roc(test.set.for.analysis$suicidal,results),main="ROC curve for test data")
##
## Call:
## roc.default(response = test.set.for.analysis$suicidal, predictor = results)
##
## Data: results in 188 controls (test.set.for.analysis$suicidal 0) < 156 cases (test.set.for.analysis$suicidal 1).
## Area under the curve: 0.8063
auc(test.set.for.analysis$suicidal, results) #0.8063
## Area under the curve: 0.8063